{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright 2020 NVIDIA Corporation. All Rights Reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "# =============================================================================="
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NVTabular / HugeCTR Criteo Example \n",
    "Here we'll show how to use NVTabular first as a preprocessing library to prepare the [Criteo Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge) dataset, and then train a model using HugeCTR.\n",
    "\n",
    "### Data Prep\n",
    "Before we get started, make sure you've run the [optimize_criteo notebook](../optimize_criteo.ipynb), which will convert the tsv data published by Criteo into the parquet format that our accelerated readers prefer. It's fair to mention at this point that the notebook will take ~30 minutes to run. While we're hoping to release accelerated csv readers in the near future, we also believe that inefficiencies in existing data representations like csv are in no small part a consequence of inefficiencies in the existing hardware/software stack. Accelerating these pipelines on new hardware like GPUs may require us to make new choices about the representations we use to store that data, and parquet represents a strong alternative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard Libraries\n",
    "import os\n",
    "from time import time\n",
    "import re\n",
    "import shutil\n",
    "import glob\n",
    "import warnings\n",
    "\n",
    "# External Dependencies\n",
    "import numpy as np\n",
    "import cupy as cp\n",
    "import cudf\n",
    "import dask_cudf\n",
    "from dask_cuda import LocalCUDACluster\n",
    "from dask.distributed import Client\n",
    "from dask.utils import parse_bytes\n",
    "from dask.delayed import delayed\n",
    "import rmm\n",
    "\n",
    "# NVTabular\n",
    "import nvtabular as nvt\n",
    "import nvtabular.ops as ops\n",
    "from nvtabular.io import Shuffle\n",
    "from nvtabular.utils import _pynvml_mem_size, device_mem_size\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset and Dataset Schema\n",
    "Once our data is ready, we'll define some high level parameters to describe where our data is and what it \"looks like\" at a high level."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define some information about where to get our data\n",
    "BASE_DIR = \"/raid/criteo/tests/\"\n",
    "input_path = os.path.join(BASE_DIR, \"crit_int_pq\")\n",
    "dask_workdir = os.path.join(BASE_DIR, \"test_dask/workdir\")\n",
    "output_path = os.path.join(BASE_DIR, \"test_dask/output\")\n",
    "stats_path = os.path.join(BASE_DIR, \"test_dask/stats\")\n",
    "\n",
    "\n",
    "#BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 800000))\n",
    "#NUM_PARTS = int(os.environ.get('NUM_PARTS', 2))\n",
    "NUM_TRAIN_DAYS = 23 # number of days worth of data to use for training, the rest will be used for validation\n",
    "\n",
    "# define our dataset schema\n",
    "CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)]\n",
    "CATEGORICAL_COLUMNS =  ['C' + str(x) for x in range(1,27)]\n",
    "LABEL_COLUMNS = ['label']\n",
    "COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + LABEL_COLUMNS\n",
    "\n",
    "# Make sure we have a clean worker space for Dask\n",
    "if os.path.isdir(dask_workdir):\n",
    "    shutil.rmtree(dask_workdir)\n",
    "os.makedirs(dask_workdir)\n",
    "\n",
    "# Make sure we have a clean stats space for Dask\n",
    "if os.path.isdir(stats_path):\n",
    "    shutil.rmtree(stats_path)\n",
    "os.mkdir(stats_path)\n",
    "         \n",
    "# Make sure we have a clean output path\n",
    "if os.path.isdir(output_path):\n",
    "    shutil.rmtree(output_path)\n",
    "os.mkdir(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NVTabular  crit_int_pq\tcriteo_output  test_dask  trash\n"
     ]
    }
   ],
   "source": [
    "! ls $BASE_DIR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['/raid/criteo/tests/crit_int_pq/day_0.parquet', '/raid/criteo/tests/crit_int_pq/day_1.parquet', '/raid/criteo/tests/crit_int_pq/day_2.parquet', '/raid/criteo/tests/crit_int_pq/day_3.parquet', '/raid/criteo/tests/crit_int_pq/day_4.parquet', '/raid/criteo/tests/crit_int_pq/day_5.parquet', '/raid/criteo/tests/crit_int_pq/day_6.parquet', '/raid/criteo/tests/crit_int_pq/day_7.parquet', '/raid/criteo/tests/crit_int_pq/day_8.parquet', '/raid/criteo/tests/crit_int_pq/day_9.parquet', '/raid/criteo/tests/crit_int_pq/day_10.parquet', '/raid/criteo/tests/crit_int_pq/day_11.parquet', '/raid/criteo/tests/crit_int_pq/day_12.parquet', '/raid/criteo/tests/crit_int_pq/day_13.parquet', '/raid/criteo/tests/crit_int_pq/day_14.parquet', '/raid/criteo/tests/crit_int_pq/day_15.parquet', '/raid/criteo/tests/crit_int_pq/day_16.parquet', '/raid/criteo/tests/crit_int_pq/day_17.parquet', '/raid/criteo/tests/crit_int_pq/day_18.parquet', '/raid/criteo/tests/crit_int_pq/day_19.parquet', '/raid/criteo/tests/crit_int_pq/day_20.parquet', '/raid/criteo/tests/crit_int_pq/day_21.parquet', '/raid/criteo/tests/crit_int_pq/day_22.parquet']\n",
      "['/raid/criteo/tests/crit_int_pq/day_23.parquet']\n"
     ]
    }
   ],
   "source": [
    "fname = 'day_{}.parquet'\n",
    "num_days = len([i for i in os.listdir(input_path) if re.match(fname.format('[0-9]{1,2}'), i) is not None])\n",
    "train_paths = [os.path.join(input_path, fname.format(day)) for day in range(NUM_TRAIN_DAYS)]\n",
    "valid_paths = [os.path.join(input_path, fname.format(day)) for day in range(NUM_TRAIN_DAYS, num_days)]\n",
    "print(train_paths)\n",
    "print(valid_paths)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Deploy a Distributed-Dask Cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table style=\"border: 2px solid white;\">\n",
       "<tr>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Client</h3>\n",
       "<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
       "  <li><b>Scheduler: </b>tcp://127.0.0.1:41653</li>\n",
       "  <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a></li>\n",
       "</ul>\n",
       "</td>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Cluster</h3>\n",
       "<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
       "  <li><b>Workers: </b>8</li>\n",
       "  <li><b>Cores: </b>8</li>\n",
       "  <li><b>Memory: </b>540.94 GB</li>\n",
       "</ul>\n",
       "</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:41653' processes=8 threads=8, memory=540.94 GB>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Dask dashboard\n",
    "dashboard_port = \"8787\"\n",
    "\n",
    "# Config for 8xV100-32GB\n",
    "# Deploy a Single-Machine Multi-GPU Cluster\n",
    "protocol = \"tcp\"             # \"tcp\" or \"ucx\"\n",
    "visible_devices = \"0,1,2,3,4,5,6,7\"  # Select devices to place workers\n",
    "device_limit_frac = 0.8      # Spill GPU-Worker memory to host at this limit.\n",
    "device_pool_frac = 0.9\n",
    "part_mem_frac = 0.1\n",
    "\n",
    "# Use total device size to calculate args.device_limit_frac\n",
    "device_size = device_mem_size(kind=\"total\")\n",
    "device_limit = int(device_limit_frac * device_size)\n",
    "device_pool_size = int(device_pool_frac * device_size)\n",
    "part_size = int(part_mem_frac * device_size)\n",
    "\n",
    "# Check if any device memory is already occupied\n",
    "for dev in visible_devices.split(\",\"):\n",
    "    fmem = _pynvml_mem_size(kind=\"free\", index=int(dev))\n",
    "    used = (device_size - fmem) / 1e9\n",
    "    if used > 1.0:\n",
    "        warnings.warn(f\"BEWARE - {used} GB is already occupied on device {int(dev)}!\")\n",
    "\n",
    "cluster = None               # (Optional) Specify existing scheduler port\n",
    "if cluster is None:\n",
    "    cluster = LocalCUDACluster(\n",
    "        protocol = protocol,\n",
    "        n_workers=len(visible_devices.split(\",\")),\n",
    "        CUDA_VISIBLE_DEVICES = visible_devices,\n",
    "        device_memory_limit = device_limit,\n",
    "        local_directory=dask_workdir,\n",
    "        dashboard_address=\":\" + dashboard_port,\n",
    "    )\n",
    "\n",
    "# Create the distributed client\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Initialize Memory Pools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'tcp://127.0.0.1:33597': None,\n",
       " 'tcp://127.0.0.1:37127': None,\n",
       " 'tcp://127.0.0.1:37983': None,\n",
       " 'tcp://127.0.0.1:38013': None,\n",
       " 'tcp://127.0.0.1:40809': None,\n",
       " 'tcp://127.0.0.1:41923': None,\n",
       " 'tcp://127.0.0.1:44193': None,\n",
       " 'tcp://127.0.0.1:45941': None}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Initialize RMM pool on ALL workers\n",
    "def _rmm_pool():\n",
    "    rmm.reinitialize(\n",
    "        pool_allocator=True,\n",
    "        initial_pool_size=device_pool_size, # Use computed pool size (device_pool_frac of total device memory)\n",
    "    )\n",
    "    \n",
    "client.run(_rmm_pool)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocessing\n",
    "At this point, our data still isn't in a form that's ideal for consumption by neural networks. The most pressing issues are missing values and the fact that our categorical variables are still represented by random, discrete identifiers, and need to be transformed into contiguous indices that can be leveraged by a learned embedding. Less pressing, but still important for learning dynamics, are the distributions of our continuous variables, which are distributed across multiple orders of magnitude and are uncentered (i.e. E[x] != 0).\n",
    "\n",
    "We can fix these issues in a concise and GPU-accelerated manner with an NVTabular `Workflow`. We'll instantiate one with our current dataset schema, then symbolically add operations _on_ that schema. By setting all these `Ops` to use `replace=True`, the schema itself will remain unmodified, while the variables represented by each field in the schema will be transformed.\n",
    "\n",
    "#### Frequency Thresholding\n",
    "One interesting thing worth pointing out is that we're using _frequency thresholding_ in our `Categorify` op. This handy functionality will map all categories which occur in the dataset with some threshold level of infrequency (which we've set here to be 15 occurrences throughout the dataset) to the _same_ index, keeping the model from overfitting to sparse signals."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "proc = nvt.Workflow(\n",
    "    cat_names=CATEGORICAL_COLUMNS,\n",
    "    cont_names=CONTINUOUS_COLUMNS,\n",
    "    label_name=LABEL_COLUMNS,\n",
    "    client = client)\n",
    "\n",
    "# log -> normalize continuous features. Note that doing this in the opposite\n",
    "# order wouldn't make sense! Note also that we're zero filling continuous\n",
    "# values before the log: this is a good time to remember that LogOp\n",
    "# performs log(1+x), not log(x)\n",
    "proc.add_cont_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])\n",
    "\n",
    "# categorification w/ MOD 10M\n",
    "proc.add_cat_preprocess([ops.Categorify(dtype = np.int64, out_path=stats_path), ops.HashBucket(10000000)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now instantiate dataset iterators to loop through our dataset (which we couldn't fit into GPU memory). We need to enforce the required HugeCTR data types, so we set them in a dictionary and pass it as an argument when creating our dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_dtypes={}\n",
    "\n",
    "for col in CONTINUOUS_COLUMNS:\n",
    "    dict_dtypes[col] = np.float32\n",
    "    \n",
    "for col in LABEL_COLUMNS:\n",
    "    dict_dtypes[col] = np.float32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size, dtypes=dict_dtypes)\n",
    "valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size, dtypes=dict_dtypes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now run them through our workflows to collect statistics on the train set, then transform and save to parquet files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_train_dir = os.path.join(output_path, 'train/')\n",
    "output_valid_dir = os.path.join(output_path, 'valid/')\n",
    "! mkdir -p $output_train_dir\n",
    "! mkdir -p $output_valid_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For reference, let's time it to see how long it takes..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 10s, sys: 6.24 s, total: 1min 16s\n",
      "Wall time: 6min 9s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "proc.apply(train_dataset, shuffle=nvt.io.Shuffle.PER_PARTITION, output_format=\"parquet\", output_path=output_train_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.13 s, sys: 636 ms, total: 4.76 s\n",
      "Wall time: 1min 3s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "proc.apply(valid_dataset, record_stats=False, shuffle=nvt.io.Shuffle.PER_PARTITION, output_format=\"parquet\", output_path=output_valid_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_values([(220817330, 16), (126535808, 16), (3014529, 16), (400781, 16), (11, 6), (2209, 16), (11869, 16), (148, 16), (4, 3), (977, 16), (15, 7), (38713, 16), (283898298, 16), (39644599, 16), (181767044, 16), (584616, 16), (12883, 16), (109, 16), (37, 12), (17177, 16), (7425, 16), (20266, 16), (4, 3), (7085, 16), (1535, 16), (64, 16)])\n"
     ]
    }
   ],
   "source": [
    "embeddings = ops.get_embedding_sizes(proc)\n",
    "print(embeddings.values())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And just like that, we have training and validation sets ready to feed to a model!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HugeCTR\n",
    "### Training\n",
    "We'll run huge_ctr using the DLRM configuration file.\n",
    "\n",
    "First, we'll reinitialize our memory pool from earlier to free up some memory so that we can share it with HugeCTR."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "rmm.reinitialize(pool_allocator=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, we run HugeCTR. For reference, let's time it to see how long it takes..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.001, init_start, ]\n",
      "HugeCTR Version: 2.2.1\n",
      "Config file: dlrm_fp32_64k.json\n",
      "[06d21h44m47s][HUGECTR][INFO]: batchsize_eval is not specified using default: 65536\n",
      "[06d21h44m47s][HUGECTR][INFO]: algorithm_search is not specified using default: 1\n",
      "[06d21h44m47s][HUGECTR][INFO]: Algorithm search: ON\n",
      "[06d21h44m47s][HUGECTR][INFO]: cuda_graph is not specified using default: 1\n",
      "[06d21h44m47s][HUGECTR][INFO]: CUDA Graph: ON\n",
      "[06d21h44m47s][HUGECTR][INFO]: Initial seed is 4348382\n",
      "[06d21h45m01s][HUGECTR][INFO]: Peer-to-peer access cannot be fully enabled.\n",
      "Device 0: Tesla V100-SXM2-32GB\n",
      "Device 1: Tesla V100-SXM2-32GB\n",
      "Device 2: Tesla V100-SXM2-32GB\n",
      "Device 3: Tesla V100-SXM2-32GB\n",
      "Device 4: Tesla V100-SXM2-32GB\n",
      "Device 5: Tesla V100-SXM2-32GB\n",
      "Device 6: Tesla V100-SXM2-32GB\n",
      "Device 7: Tesla V100-SXM2-32GB\n",
      "[06d21h45m01s][HUGECTR][INFO]: cache_eval_data is not specified using default: 0\n",
      "[06d21h45m02s][HUGECTR][INFO]: num_internal_buffers 1\n",
      "[06d21h45m03s][HUGECTR][INFO]: num_internal_buffers 1\n",
      "[06d21h45m03s][HUGECTR][INFO]: Vocabulary size: 856783536\n",
      "[06d21h45m03s][HUGECTR][INFO]: max_vocabulary_size_per_gpu_=15500000\n",
      "[06d21h45m03s][HUGECTR][INFO]: All2All Warmup Start\n",
      "[06d21h45m04s][HUGECTR][INFO]: All2All Warmup End\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu0 start to init embedding\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu1 start to init embedding\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu2 start to init embedding\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu3 start to init embedding\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu4 start to init embedding\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu5 start to init embedding\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu6 start to init embedding\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu7 start to init embedding\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu0 init embedding done\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu1 init embedding done\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu2 init embedding done\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu3 init embedding done\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu4 init embedding done\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu5 init embedding done\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu6 init embedding done\n",
      "[06d21h45m12s][HUGECTR][INFO]: gpu7 init embedding done\n",
      "[06d21h45m12s][HUGECTR][INFO]: decay_power is not specified using default: 2.000000\n",
      "[06d21h45m12s][HUGECTR][INFO]: end_lr is not specified using default: 0.000000\n",
      "[26285, init_end, ]\n",
      "[26285, run_start, ]\n",
      "HugeCTR training start:\n",
      "[26285, train_epoch_start, 0, ]\n",
      "[06d21h46m04s][HUGECTR][INFO]: Iter: 1000 Time(1000 iters): 52.105291s Loss: 0.131518 lr:3.003000\n",
      "[06d21h46m56s][HUGECTR][INFO]: Iter: 2000 Time(1000 iters): 51.121085s Loss: 0.133981 lr:6.003000\n",
      "[06d21h47m46s][HUGECTR][INFO]: Iter: 3000 Time(1000 iters): 50.617595s Loss: 0.132952 lr:9.003000\n",
      "[190298, eval_start, 0.0499898, ]\n",
      "[06d21h48m55s][HUGECTR][INFO]: Evaluation, AUC: 0.714515\n",
      "[249428, eval_accuracy, 0.714515, 0.0499898, 3200, ]\n",
      "[06d21h48m55s][HUGECTR][INFO]: Eval Time for 2720 iters: 59.130286s\n",
      "[249428, eval_stop, 0.0499898, ]\n",
      "[06d21h49m36s][HUGECTR][INFO]: Iter: 4000 Time(1000 iters): 110.215660s Loss: 0.134619 lr:12.003000\n",
      "[06d21h50m27s][HUGECTR][INFO]: Iter: 5000 Time(1000 iters): 50.412671s Loss: 0.138229 lr:15.003000\n",
      "[06d21h51m18s][HUGECTR][INFO]: Iter: 6000 Time(1000 iters): 50.787389s Loss: 0.132772 lr:18.003000\n",
      "[411860, eval_start, 0.0999797, ]\n",
      "[06d21h52m36s][HUGECTR][INFO]: Evaluation, AUC: 0.725409\n",
      "[469530, eval_accuracy, 0.725409, 0.0999797, 6400, ]\n",
      "[06d21h52m36s][HUGECTR][INFO]: Eval Time for 2720 iters: 57.669482s\n",
      "[469530, eval_stop, 0.0999797, ]\n",
      "[06d21h53m06s][HUGECTR][INFO]: Iter: 7000 Time(1000 iters): 108.330102s Loss: 0.128844 lr:21.003000\n",
      "[06d21h53m56s][HUGECTR][INFO]: Iter: 8000 Time(1000 iters): 50.526227s Loss: 0.142645 lr:24.000000\n",
      "[06d21h54m47s][HUGECTR][INFO]: Iter: 9000 Time(1000 iters): 50.145754s Loss: 0.134447 lr:24.000000\n",
      "[630761, eval_start, 0.14997, ]\n",
      "[06d21h56m15s][HUGECTR][INFO]: Evaluation, AUC: 0.733410\n",
      "[688614, eval_accuracy, 0.73341, 0.14997, 9600, ]\n",
      "[688614, train_samples, 629211136, ]\n",
      "Hit target accuracy AUC 0.732500 at epoch 0.149970 with batchsize: 65536 in 662.33 s. Average speed 949899.08 records/s.\n",
      "[688613.97, eval_stop, 0.149970, ]\n",
      "[688613.98, train_epoch_end, 1, ]\n",
      "[688613.98, run_stop, ]\n",
      "CPU times: user 7.49 s, sys: 1.89 s, total: 9.38 s\n",
      "Wall time: 11min 34s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "! /usr/local/hugectr/bin/huge_ctr --train dlrm_fp32_64k.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "file_extension": ".py",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  },
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
